Please feel free to contact Wanze Xie (WX: wanze.xie@childrens.harvard.edu) or Joe Bathelt (JB: JoeBathelt@gmail.com) for any questions.
"Esc" will take me into command mode where I can navigate around my notebook with arrow keys. Some useful shortcuts can be found here: https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/
import warnings
warnings.filterwarnings("ignore")
import bct
import impyute
import itertools
import matplotlib.pylab as plt
import matplotlib.patches as patches
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as mpatches
import networkx as nx
import os
import pandas as pd
import numpy as np
import re
import seaborn as sns
from scipy.stats import chisquare, sem, ttest_ind, zscore
from scipy.spatial.distance import pdist, squareform
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
The following code (originally by JB) sets up the parameters and styles for plots and display.
sns.set_style('white')
from matplotlib import rcParams
# Serif (Computer Modern) look without requiring a LaTeX installation
rcParams['font.family'] = 'serif'
rcParams['font.serif'] = ['Computer Modern Unicode'] + plt.rcParams['font.serif']
rcParams['text.usetex'] = False
# Uniform 9 pt text for journal-sized figures
rcParams['axes.labelsize'] = 9
rcParams['xtick.labelsize'] = 9
rcParams['ytick.labelsize'] = 9
rcParams['legend.fontsize'] = 9
# Conversion factor mm -> inches, and standard journal figure widths in inches
mm2inches = 0.039371
single_column = 90*mm2inches
double_column = 190*mm2inches
one_half_column = 140*mm2inches
Function definitions to be used further down. This includes the consensus community clustering algorithm.
def get_consensus_module_assignment(correlation_matrix):
    """Consensus community detection on a correlation matrix.

    Runs signed Louvain community detection (with fine-tuning) 50 times
    on the absolute-thresholded matrix, builds a module-agreement matrix
    across repetitions, and extracts a consensus partition
    (Lancichinetti & Fortunato approach via ``bct.consensus_und``).

    Parameters
    ----------
    correlation_matrix : square 2-D numpy array
        Node-by-node correlation matrix.

    Returns
    -------
    (consensus_partition, q) : tuple
        consensus_partition : consensus module label per node.
        q : modularity index from the LAST Louvain repetition only
            (a single sample of Q, not an average across repetitions).
    """
    gamma = 1.0        # Louvain resolution parameter
    repetitions = 50   # number of independent Louvain runs
    # Threshold weak connections ONCE before the loop: the operation is
    # idempotent, so hoisting it out of the loop gives the same input to
    # every repetition while avoiding 49 redundant passes.
    correlation_matrix = bct.threshold_absolute(correlation_matrix, 0.1)
    agreement_matrix = []
    for _ in np.arange(0, repetitions):
        # Community detection (signed Louvain)
        modules, q = bct.modularity_louvain_und_sign(correlation_matrix, gamma=gamma)
        # Fine-tuning the community solution
        modules, _ = bct.modularity_finetune_und(correlation_matrix, modules, gamma=gamma)
        # Agreement matrix for this run: 1 where two nodes share a module
        module_matrix = np.repeat(modules, repeats=correlation_matrix.shape[0])
        module_matrix = np.reshape(module_matrix, newshape=correlation_matrix.shape)
        agreement_matrix.append(1.*(module_matrix == module_matrix.transpose()))
    # Fraction of repetitions in which each node pair was co-assigned
    agreement_matrix = np.rollaxis(np.asarray(agreement_matrix), 0, 3)
    agreement_matrix = np.sum(agreement_matrix, 2)/repetitions
    # Create the consensus partition (Lancichinetti & Fortunato)
    consensus_partition = bct.consensus_und(agreement_matrix, tau=0.1)
    return(consensus_partition, q)
def plot_adjmat(sorted_corrmat, cluster_vector):
    """Plot a cluster-sorted correlation matrix with cluster outlines.

    Draws the matrix as a heatmap and overlays one rectangle per cluster
    along the diagonal, then saves the figure to 'output.jpg' and shows it.

    Parameters
    ----------
    sorted_corrmat : 2-D array
        Correlation matrix already sorted by cluster membership.
    cluster_vector : 1-D array of cluster labels (sorted internally here).
    """
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(90 * mm2inches, 80 * mm2inches), dpi=72)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.set_style('white')
    # cbar=False means cbar_kws has no effect; kept for parity with the call style
    sns.heatmap(sorted_corrmat, vmax=1.0, cbar=False, center=0, cmap='bwr',
                square=True, cbar_kws={"shrink": .5})
    ax.set_xticklabels(' ')
    ax.set_yticklabels(' ')
    # One Rectangle patch per cluster. The member indices are computed
    # once per cluster instead of via four redundant np.where calls.
    sorted_labels = np.asarray(sorted(cluster_vector))
    for cluster in np.unique(cluster_vector):
        member_idx = np.where(sorted_labels == cluster)[0]
        x = 0 + member_idx[0]
        y = len(cluster_vector) - member_idx[-1]
        width = height = member_idx[-1] - member_idx[0]
        rect = patches.Rectangle((x, y),
                                 width,
                                 height,
                                 linewidth=0.5, edgecolor='k', facecolor='none')
        # Add the patch to the Axes
        ax.add_patch(rect)
    plt.savefig('output.jpg')
    plt.show()
loading the data
# Load the raw questionnaire data and select the subject ID, sex, and
# every IBQ (Infant Behavior Questionnaire) column.
df = pd.read_csv('../data/raw/tempandquestions_all.csv')
IBQ_columns = [column for column in df.columns if re.search('IBQ', column)]
columns_to_use = np.hstack([['subj', 'Sex'], IBQ_columns])
print(columns_to_use.shape)
print(columns_to_use)
Delete the IBQ_DOC (date) and IBQ_Complete (logical) columns. WX manually removed these two columns from the data file, so the following cell does not need to be run.
columns_to_use = np.delete(columns_to_use,[2,3],0) print(columns_to_use.shape) print(columns_to_use)
# Keep only the selected columns
df = df[columns_to_use].copy()
print(df.shape)
print(df)
# Get the age information from their subject IDs
# (assumes IDs contain the age as digits between 'a' and 'f' — TODO confirm the ID format)
df['Age'] = [int(df['subj'][i].split('a')[1].split('f')[0]) for i in np.arange(0, df.shape[0])]
print(df.columns)
# check if the data type of the IBQ variables is float.
# In the new csv file, the empty cells have a space in them, which resulted in the variables being read in as objects
# because they are a mix of strings (i.e., " ") and values
print(df['IBQ_Distress'].dtype)
# Keep an untouched copy so later cells can restart from this point
dfbackup = df.copy()
print(dfbackup.shape)
# Delete the subjects with old IBQ questionnaires
# (rows where all three newer scales are NaN are treated as the old form)
df = dfbackup.copy()
df['old_IBQ'] = np.isnan(df['IBQ_Fall_React'].values) & np.isnan(df['IBQ_Cuddliness'].values) & np.isnan(df['IBQ_Percept_Sens'].values)
df = df[df['old_IBQ'] == False]
print(df.shape)
print(df)
Imputing missing values with Multivariate Imputation by Chained Equations (mice)
# Impute missing IBQ values with MICE (Multivariate Imputation by Chained Equations)
df[IBQ_columns] = impyute.mice(df[IBQ_columns].values, loop=5000)
print(df.shape)
# check if there is still missing value (nan) for each variable
df.isnull().any(axis=0)
This was JB's code to use "Expectation Maximization (EM)" for imputation.
df[IBQ_columns] = impyute.em(df[IBQ_columns].values, loop=5000)
Visualizing the distributions of the behavioural variables
# IBQ (Infant Behavior Questionnaire) scale names; the CSV columns are
# these names prefixed with 'IBQ_'. Order matters: the plotting code
# colour-codes tick labels by position in this list.
IBQ_scales = ['Approach', 'Vocal_React', 'High_Pleas', 'Smiling',
              'Activity', 'Percept_Sens', 'Sadness', 'Distress',
              'Fear', 'Fall_React', 'Low_Pleas', 'Cuddliness',
              'Orienting', 'Soothability']
IBQ_columns = ['IBQ_%s' % scale for scale in IBQ_scales]
def community_clustering(df):
    """Cluster participants by the similarity of their scale profiles.

    Builds a participant-by-participant correlation network, runs
    consensus community detection on it, and displays the correlation
    matrix before and after sorting rows/columns by cluster.

    Parameters
    ----------
    df : DataFrame
        Participants in rows, questionnaire scales in columns.

    Returns
    -------
    clusters : 1-D array of consensus module labels, one per participant.
    """
    # NOTE(review): a pdist/squareform Euclidean-distance computation was
    # removed here — its result was never used (and its comment claimed
    # the Mahalanobis metric while computing Euclidean).
    # Building the participant by participant network
    corrmat = df.transpose().corr().values
    # Running the community clustering
    clusters, q = get_consensus_module_assignment(corrmat)
    print('Q-index: q=%.2f' % q)
    # Permutation that orders participants by their cluster label
    sorting_array = sorted(range(len(clusters)), key=lambda k: clusters[k])
    sorted_array = corrmat
    sorted_array = sorted_array[sorting_array, :]
    sorted_array = sorted_array[:, sorting_array]
    # Plot unsorted vs cluster-sorted matrix side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(80*mm2inches, 40*mm2inches), dpi=300)
    ax1.imshow(corrmat, vmin=-1, vmax=1, cmap='bwr', aspect='auto')
    ax1.axis('off')
    ax2.imshow(sorted_array, vmin=-1, vmax=1, cmap='bwr', aspect='auto')
    ax2.axis('off')
    plt.show()
    return clusters
def plot_profiles(df, clusters):
    """Plot the mean z-scored IBQ profile per cluster (error bars = 2*SEM).

    Parameters
    ----------
    df : DataFrame of the (z-scored) IBQ columns, one row per participant.
    clusters : 1-D array of cluster labels aligned with df's rows.

    At most three cluster profiles are drawn (as in the original code).
    """
    # Work on a copy: callers pass slices like df[IBQ_columns], so writing
    # a 'group' column into the argument mutated the caller's frame or a
    # pandas SettingWithCopy temporary.
    df = df.copy()
    df['group'] = clusters
    df['group'] = df['group'].astype('int')
    # Per-cluster mean and 2*SEM of every scale
    means = df.groupby('group').mean().values
    SE = 2*df.groupby('group').sem().values
    colours = ["#E64B35B2", "#4DBBD5B2", "#00A087B2", "#3C5488B2", "#F39B7FB2", "#8491B4B2", "#91D1C2B2", "#DC0000B2", "#7E6148B2"]
    colnames = [''.join(IBQ_column.split('_')[1:]) for IBQ_column in IBQ_columns]
    plt.figure(figsize=(50*mm2inches, 50*mm2inches), dpi=300)
    # Each cluster's x positions are shifted by 0.1 so profiles don't overlap exactly
    plt.errorbar(np.arange(0*0.1, means.shape[1]), means[0, :], SE[0, :], color=colours[0], linewidth=1,
                 marker='o', markersize=2, capsize=2, elinewidth=1, markeredgewidth=2)
    plt.errorbar(np.arange(1*0.1, means.shape[1]), means[1, :], SE[1, :], color=colours[1], linewidth=1,
                 marker='o', markersize=2, capsize=2, elinewidth=1, markeredgewidth=2)
    if len(np.unique(clusters)) > 2:
        plt.errorbar(np.arange(2*0.1, means.shape[1]), means[2, :], SE[2, :], color=colours[2], linewidth=1,
                     marker='o', markersize=2, capsize=2, elinewidth=1, markeredgewidth=2)
    else:
        print('There are two clusters')
    labels = ['C' + str(i) for i in np.arange(1, len(np.unique(df['group']))+1)]
    plt.legend(labels, bbox_to_anchor=(1.1, 1.05))
    plt.ylim([-2, 2])
    # Reference lines at 0 and +/-1 z
    plt.axhline(0, color='k', linewidth=1, alpha=0.8)
    plt.axhline(-1, color='k', linestyle='dashed', linewidth=1, alpha=0.8)
    plt.axhline(+1, color='k', linestyle='dashed', linewidth=1, alpha=0.8)
    ax = plt.gca()
    ax.set_xticks(np.arange(0, len(colnames)))
    ax.set_xticklabels(colnames, rotation=90)
    # set the distance between tick labels and axes
    ax.tick_params(axis='both', which='major', pad=0)
    # WX changed the font of the labels because they appear to be crowded on his monitor.
    # NOTE(review): Tick.label is deprecated in newer matplotlib (use
    # tick.label1) — confirm against the pinned matplotlib version.
    indexes = range(1, len(IBQ_scales)+1, 1)
    for tick, index in zip(ax.xaxis.get_major_ticks(), indexes):
        tick.label.set_fontsize(6)
        # Colour tick labels in three bands by scale position
        if index <= 6:
            tick.label.set_color('navy')
        elif index >= 11:
            tick.label.set_color('b')
        else:
            tick.label.set_color('magenta')
Community clustering on the original data. It is a test for the programs, but it does not take care of age and sex that both should have an impact on child temperament.
# Test run on raw (z-scored only) data — no age/sex correction yet
temp_df = df[np.hstack(['subj', IBQ_columns, 'Age'])].copy()
temp_df[IBQ_columns] = temp_df[IBQ_columns].copy().apply(zscore)
clusters = community_clustering(temp_df[IBQ_columns])
plot_profiles(temp_df[IBQ_columns], clusters)
Calculate the residuals for each IBQ variable, with age as the regressor.
# Regress age (as a categorical factor) out of each IBQ scale, keep the
# residuals, then z-score them
temp_df = df[np.hstack(['subj', IBQ_columns, 'Age','Sex'])].copy()
for column in IBQ_columns:
    temp_df[column] = smf.ols(column + '~ C(Age)', data=temp_df).fit().resid.values
temp_df[IBQ_columns] = temp_df[IBQ_columns].copy().apply(zscore)
Do community detection with the "age-regressed" data
# Community detection and profile plot on the age-regressed data
clusters = community_clustering(temp_df[IBQ_columns])
plot_profiles(temp_df[IBQ_columns], clusters)
Save the clustering results to a .csv file
# Save the 1-year (age-regressed) cluster assignments to CSV
temp_df['clusters_1yrs'] = clusters
temp_df[['subj', 'Age','Sex','clusters_1yrs']].to_csv('../data/derived/clusters_1yrs_AgeRegressed.csv')
# boy(0)
# Boys only (Sex == 0): age-regress, z-score, cluster, plot, and save
temp_df = df[np.hstack(['subj', IBQ_columns, 'Age','Sex'])].copy()
temp_df_b = temp_df[temp_df['Sex'].isin([0])].copy()
for column in IBQ_columns:
    temp_df_b[column] = smf.ols(column + '~ C(Age)', data=temp_df_b).fit().resid.values
temp_df_b[IBQ_columns] = temp_df_b[IBQ_columns].copy().apply(zscore)
print(temp_df_b.shape)
# temp_df_b = temp_df[temp_df['Sex'].isin([0])].copy()
# print(temp_df_b.shape)
clusters = community_clustering(temp_df_b[IBQ_columns])
plot_profiles(temp_df_b[IBQ_columns], clusters)
# save the output
temp_df_b['clusters_1yrs'] = clusters
temp_df_b[['subj', 'Age','Sex','clusters_1yrs']].to_csv('../data/derived/clusters_1yrs_AgeRegressed_boys.csv')
# girl(1)
# Girls only (Sex == 1): age-regress, z-score, cluster, plot, and save
temp_df = df[np.hstack(['subj', IBQ_columns, 'Age','Sex'])].copy()
temp_df_g = temp_df[temp_df['Sex'].isin([1])].copy()
for column in IBQ_columns:
    temp_df_g[column] = smf.ols(column + '~ C(Age)', data=temp_df_g).fit().resid.values
temp_df_g[IBQ_columns] = temp_df_g[IBQ_columns].copy().apply(zscore)
print(temp_df_g.shape)
clusters = community_clustering(temp_df_g[IBQ_columns])
plot_profiles(temp_df_g[IBQ_columns], clusters)
# save the output
temp_df_g['clusters_1yrs'] = clusters
temp_df_g[['subj', 'Age','Sex','clusters_1yrs']].to_csv('../data/derived/clusters_1yrs_AgeRegressed_girls.csv')
define the ECBQ variables
# ECBQ (Early Childhood Behavior Questionnaire) scale names; the CSV
# columns are these names wrapped as 'ECBQ_<name>_<age>yr'. Order
# matters: the plotting code colour-codes tick labels by position.
EBQ_scales = [
    'Impulsive', 'Activity', 'High_Pleas', 'Sociability',
    'Pos_Anticipate', 'Discomfort', 'Fear', 'Motor',
    'Sadness', 'Percept_Sens', 'Shyness', 'Soothability',
    'Frustration', 'Inhib_Cntrl', 'Attn_Shift', 'Low_Pleas',
    'Cuddliness', 'Attn_Focus',
]
read in the ECBQ data for 2 yrs
# Load the raw data and select subject ID, sex, and the 2-year ECBQ columns
df = pd.read_csv('../data/raw/TempAndQuestions_all.csv')
EBQ_columns = ['ECBQ_' + column + '_2yr' for column in EBQ_scales]
columns_to_use = np.hstack([['subj', 'Sex'], EBQ_columns])
# columns_to_use = np.hstack([['subj'], EBQ_columns])
df = df[columns_to_use].copy()
print('Total included: %i' % len(df))
print(df.shape)
checking missing data
# Exclude participants missing more than 2 ECBQ variables (too sparse to impute)
df['missing'] = df[EBQ_columns].isnull().sum(axis=1) > 2 # if missing more than 2 ECBQ variables then no interpolation
print('Number of cases with missing values: %i' % sum(df['missing']))
df = df[df['missing'] == False]
print('Total included: %i' % len(df))
print(df.shape)
Impute the remaining missing values.
# Impute the remaining missing ECBQ values with MICE
df[EBQ_columns] = impyute.mice(df[EBQ_columns].values, loop=5000)
# df = df.dropna()
print(df)
# print(df[EBQ_columns])
# check if there is still missing value (nan) for each variable
df.isnull().any(axis=0)
Community detection. The plotting function below is a modified copy of plot_profiles that uses the ECBQ scales instead of the IBQ scales.
def plot_profiles(df, clusters):
    """Plot the mean z-scored ECBQ profile per cluster (error bars = 2*SEM).

    ECBQ variant of the earlier IBQ plot_profiles (this redefinition
    replaces it for the 2- and 3-year analyses).

    Parameters
    ----------
    df : DataFrame of the (z-scored) ECBQ columns, one row per participant.
    clusters : 1-D array of cluster labels aligned with df's rows.

    At most three cluster profiles are drawn (as in the original code).
    """
    # Work on a copy: callers pass slices like df[EBQ_columns], so writing
    # a 'group' column into the argument mutated the caller's frame or a
    # pandas SettingWithCopy temporary.
    df = df.copy()
    df['group'] = clusters
    df['group'] = df['group'].astype('int')
    # Per-cluster mean and 2*SEM of every scale
    means = df.groupby('group').mean().values
    SE = 2*df.groupby('group').sem().values
    colours = ["#E64B35B2", "#4DBBD5B2", "#00A087B2", "#3C5488B2", "#F39B7FB2", "#8491B4B2", "#91D1C2B2", "#DC0000B2", "#7E6148B2"]
    # colnames = [''.join(EBQ_column.split('_')[1:]) for EBQ_column in EBQ_columns]
    colnames = EBQ_scales.copy()
    plt.figure(figsize=(50*mm2inches, 50*mm2inches), dpi=300)
    # Each cluster's x positions are shifted by 0.1 so profiles don't overlap exactly
    plt.errorbar(np.arange(0*0.1, means.shape[1]), means[0, :], SE[0, :], color=colours[0], linewidth=1,
                 marker='o', markersize=2, capsize=2, elinewidth=1, markeredgewidth=2)
    plt.errorbar(np.arange(1*0.1, means.shape[1]), means[1, :], SE[1, :], color=colours[1], linewidth=1,
                 marker='o', markersize=2, capsize=2, elinewidth=1, markeredgewidth=2)
    if len(np.unique(clusters)) > 2:
        plt.errorbar(np.arange(2*0.1, means.shape[1]), means[2, :], SE[2, :], color=colours[2], linewidth=1,
                     marker='o', markersize=2, capsize=2, elinewidth=1, markeredgewidth=2)
    else:
        print('There are two clusters')
    labels = ['C' + str(i) for i in np.arange(1, len(np.unique(df['group']))+1)]
    plt.legend(labels, bbox_to_anchor=(1.1, 1.05))
    plt.ylim([-2, 2])
    # Reference lines at 0 and +/-1 z
    plt.axhline(0, color='k', linewidth=1, alpha=0.8)
    plt.axhline(-1, color='k', linestyle='dashed', linewidth=1, alpha=0.8)
    plt.axhline(+1, color='k', linestyle='dashed', linewidth=1, alpha=0.8)
    ax = plt.gca()
    # Integer step so each tick sits on the data column it labels
    # (a step of 1.02 let the labels drift off the integer x positions
    # used by errorbar above; the IBQ version uses step 1).
    ax.set_xticks(np.arange(0, len(colnames), 1))
    ax.set_xticklabels(colnames, rotation=90)
    # set the distance between tick labels and axes
    ax.tick_params(axis='both', which='major', pad=0)
    # WX changed the font of the labels because they appear to be crowded on his monitor.
    # NOTE(review): Tick.label is deprecated in newer matplotlib (use
    # tick.label1) — confirm against the pinned matplotlib version.
    indexes = range(1, len(EBQ_scales)+1, 1)
    for tick, index in zip(ax.xaxis.get_major_ticks(), indexes):
        tick.label.set_fontsize(6)
        # Colour tick labels in three bands by scale position
        if index <= 5:
            tick.label.set_color('navy')
        elif index >= 14:
            tick.label.set_color('b')
        else:
            tick.label.set_color('magenta')
# Community clustering on the data
# temp_df = df[np.hstack(['subj', EBQ_columns, 'Age'])].copy()
# z-score the 2-yr ECBQ scales, cluster all participants, plot, and save
temp_df = df.copy();
temp_df[EBQ_columns] = temp_df[EBQ_columns].copy().apply(zscore)
clusters = community_clustering(temp_df[EBQ_columns])
nclusters = np.unique(clusters)
print('Total clusters: %i' % len(nclusters))
# Plots
plot_profiles(temp_df[EBQ_columns], clusters)
# save the output
temp_df['clusters_2yrs'] = clusters
temp_df[['subj','Sex','clusters_2yrs']].to_csv('../data/derived/clusters_2yrs.csv')
boys (Sex = 0)
# Boys only (Sex == 0)
temp_df_b = df[df['Sex'].isin([0])].copy()
print(temp_df_b.shape)
Clustering and plotting for boys only
# z-score, cluster, plot, and save the 2-yr results for boys
temp_df_b[EBQ_columns] = temp_df_b[EBQ_columns].copy().apply(zscore)
clusters = community_clustering(temp_df_b[EBQ_columns])
plot_profiles(temp_df_b[EBQ_columns], clusters)
# save the output
temp_df_b['clusters_2yrs'] = clusters
temp_df_b[['subj','Sex','clusters_2yrs']].to_csv('../data/derived/clusters_2yrs_boys.csv')
girls (Sex = 1)
# Girls only (Sex == 1)
temp_df_g = df[df['Sex'].isin([1])].copy()
print(temp_df_g.shape)
Clustering and plotting for girls only.
# z-score, cluster, plot, and save the 2-yr results for girls
temp_df_g[EBQ_columns] = temp_df_g[EBQ_columns].copy().apply(zscore)
clusters = community_clustering(temp_df_g[EBQ_columns])
plot_profiles(temp_df_g[EBQ_columns], clusters)
# save the output
temp_df_g['clusters_2yrs'] = clusters
temp_df_g[['subj','Sex','clusters_2yrs']].to_csv('../data/derived/clusters_2yrs_girls.csv')
Read in 3 yrs ECBQ data
# Load the raw data and select subject ID, sex, and the 3-year ECBQ columns
df = pd.read_csv('../data/raw/TempAndQuestions_all.csv')
EBQ_columns = ['ECBQ_' + column + '_3yr' for column in EBQ_scales]
# columns_to_use = np.hstack([['subj', 'Sex'], EBQ_columns])
columns_to_use = np.hstack([['subj', 'Sex'], EBQ_columns])
df = df[columns_to_use].copy()
print('Total included: %i' % len(df))
print(df.shape)
checking missing data
# Exclude cases missing more than 2 ECBQ variables, then impute the rest with MICE
df['missing'] = df[EBQ_columns].isnull().sum(axis=1) > 2 # if missing more than 2 ECBQ variables then no interpolation
print('Number of cases with missing values: %i' % sum(df['missing']))
df = df[df['missing'] == False]
print('Total included: %i' % len(df))
print(df.shape)
df[EBQ_columns] = impyute.mice(df[EBQ_columns].values, loop=5000)
#df = df.dropna()
print(df)
# Community clustering on the data
# temp_df = df[np.hstack(['subj', EBQ_columns, 'Age'])].copy()
# z-score the 3-yr ECBQ scales, cluster all participants, plot, and save
temp_df = df.copy();
temp_df[EBQ_columns] = temp_df[EBQ_columns].copy().apply(zscore)
clusters = community_clustering(temp_df[EBQ_columns])
# Plots
plot_profiles(temp_df[EBQ_columns], clusters)
# save the output
temp_df['clusters_3yrs'] = clusters
temp_df[['subj','Sex','clusters_3yrs']].to_csv('../data/derived/clusters_3yrs.csv')
boys (Sex = 0)
# Boys only (Sex == 0)
temp_df_b = df[df['Sex'].isin([0])].copy()
print(temp_df_b.shape)
Clustering and plotting for boys only
# z-score, cluster, plot, and save the 3-yr results for boys
temp_df_b[EBQ_columns] = temp_df_b[EBQ_columns].copy().apply(zscore)
clusters = community_clustering(temp_df_b[EBQ_columns])
plot_profiles(temp_df_b[EBQ_columns], clusters)
# save the output
temp_df_b['clusters_3yrs'] = clusters
temp_df_b[['subj','Sex','clusters_3yrs']].to_csv('../data/derived/clusters_3yrs_boys.csv')
girls (Sex = 1)
# Girls only (Sex == 1)
temp_df_g = df[df['Sex'].isin([1])].copy()
print(temp_df_g.shape)
Clustering and plotting for girls only.
# z-score, cluster, plot, and save the 3-yr results for girls
temp_df_g[EBQ_columns] = temp_df_g[EBQ_columns].copy().apply(zscore)
clusters = community_clustering(temp_df_g[EBQ_columns])
plot_profiles(temp_df_g[EBQ_columns], clusters)
# save the output
temp_df_g['clusters_3yrs'] = clusters
temp_df_g[['subj','Sex','clusters_3yrs']].to_csv('../data/derived/clusters_3yrs_girls.csv')